knitr::opts_chunk$set(message=FALSE, warning=FALSE, cache=TRUE)
library(tidyverse)
library(covidcast)
# Fetch the following sources and signals from the API
# TODO: Add Google Symptoms "eventually"
source_names = c("doctor-visits", "fb-survey", "fb-survey", "hospital-admissions")
signal_names = c("smoothed_adj_cli", "smoothed_cli", "smoothed_hh_cmnty_cli",
"smoothed_adj_covid19")
pretty_names = c("Doctor visits", "Facebook CLI", "Facebook CLI-in-community",
"Hospitalizations")
target_names = c("Cases", "Cases", "Cases", "Deaths")
geo_level = "county"
start_day = "2020-04-15"
end_day = NULL
cache_fname = 'cached_data/03_heterogeneity_core_indicators.RDS'
if (!file.exists(cache_fname)) {
df_signals = vector("list", length(signal_names))
for (i in 1:length(signal_names)) {
df_signals[[i]] = suppressWarnings(
covidcast_signal(source_names[i], signal_names[i],
start_day, end_day,
geo_type=geo_level))
}
# Fetch USAFacts confirmed case incidence proportion (smoothed with 7-day
# trailing average)
df_cases = suppressWarnings(
covidcast_signal("usa-facts", "confirmed_7dav_incidence_prop",
start_day, end_day,
geo_type=geo_level))
df_deaths = suppressWarnings(
covidcast_signal("usa-facts", "deaths_7dav_incidence_prop",
start_day, end_day,
geo_type=geo_level))
case_num = 500
geo_values = suppressWarnings(covidcast_signal("usa-facts", "confirmed_cumulative_num",
max(df_cases$time_value),
max(df_cases$time_value))) %>%
filter(value >= case_num) %>% pull(geo_value)
saveRDS(list(df_signals, df_cases, df_deaths), cache_fname)
} else {
cached_data = readRDS(cache_fname)
df_signals = cached_data[[1]]
df_cases = cached_data[[2]]
df_deaths = cached_data[[3]]
}
The purpose of this note is to give a first look at heterogeneity (spatial and temporal) relationships between our indicators and the quantities that we wish to predict – usually, case rates, or death rates.
Heterogeneity in a relationship between two variables \(x\) and \(y\) means that their relationship “varies”. For our purposes, this can mean that the relationship varies in time (temporal heterogeneity) or it varies across locations (spatial heterogeneity).
Aaron first discovered heterogeneity in the doctor visits indicator, which inspired this DAP. The idea of sensorization, which we apply in this DAP to account for heterogeneity, has a long history in Delphi; see David’s thesis and Maria’s paper.
In the following plots, we see both spatial heterogeneity, in which two counties exhibit the relationship between doctors visits and case rate different; as well as temporal heterogeneity, where, within a single county, the relationship between doctors visits and case rate evolves with time.
dv = tibble(df_signals[[1]])
cases = tibble(df_cases)
county_tibble = covidcast::county_geo %>% transmute (
geo_value = fips,
county_name=county,
state=abbr,
county_name_fips = sprintf('FIPS: %s\n%s, %s',
geo_value, county_name, state),
)
dv_cases = inner_join(
df_signals[[1]], cases, by=c('geo_value', 'time_value')
) %>% select (
geo_value=geo_value,
time_value=time_value,
indicator_value=value.x,
target_value=value.y,
) %>% inner_join (
county_tibble,
on='geo_value',
) %>% tibble
MARICOPA_AZ = '04013'
FRANKLIN_OH = '39049'
FULTON_IL = '17057'
NATRONA_WY = '56025'
BROWN_SD = '46013'
TEMP_HET = MARICOPA_AZ
SPAT_HET = c(FRANKLIN_OH,
# NATRONA_WY,
BROWN_SD)
plt_temporal_heterogeneity = dv_cases %>% filter (
geo_value == TEMP_HET,
) %>% ggplot(
) + geom_point(
shape=21,
colour='black',
aes(
x=indicator_value,
y=target_value,
fill=time_value,
),
) + scale_fill_viridis_c(
trans='date',
) + xlab (
"Doctor visits"
) + ylab (
"Cases per 100k"
) + ggtitle (
"Temporal heterogenenity (Maricopa County, AZ)"
) + theme(legend.position = "bottom"
)
plt_spatial_heterogeneity = dv_cases %>% filter (
geo_value %in% SPAT_HET
) %>% ggplot(
) + geom_point(
shape=21,
colour='black',
aes(
x=indicator_value,
y=target_value,
#fill=geo_value,
fill=county_name_fips,
),
) + xlab (
"Doctor visits"
) + ylab (
"Cases per 100k"
) + ggtitle (
"Spatial heterogeneity"
) + theme(legend.position = "bottom"
)
gridExtra::grid.arrange(
plt_spatial_heterogeneity,
plt_temporal_heterogeneity,
ncol=2)
The full set of plots may be found here.
During one of our modeling meetings, Logan asked whether, for example in Maricopa county above, the observed changed in slope was truly to due to a change in the underlying relationship, or whether it was due to a degradation in our ability to measure case rates (or doctors visits). In essence, if the change in slope was actually due to a degradation in case rate measurement, e.g. because of reduced testing capacity, then perhaps we don’t want to sensorize and correct doctors visits downwards. On Slack, Aaron’s response was that a possible degradation of case rate measurement cannot be the sole explainer of the degradation of correlation between DV and case rate, because other indicators did not see as drastic a fall in their (unsensorized) correlations.
First, we fix notation. Assume an indicator and target (e.g., doctors visits and case rate), which we suppress notationally for brevity. Each observation is then represented as \((x_{t\ell}, y_{t\ell})\), where \(x\) is the indicator value, \(y\) is the target value, \(t\) represents time (measured in dates), and \(\ell\) represents location. Let \(L\) denote the set of all valid locations, e.g., all counties. Let \(x_{t\cdot}\) denote all the \(x_{t\ell}\) collected across locations in \(L\), and similarly for \(y_{t\cdot}\). Let \(x_{t_1:t_2, \ell}\), \(y_{t_1:t_2, \ell}\) denote the observations that fall within times \(t_1, t_2\), endpoints included. Finally, let \(x, y\) be the collection of all observations across time and location.
In the classical, unsensorized approach, we take the \(x_{t\ell}\) and hope that they give some indication of the intensity of \(y_{t\ell}\), e.g., we may compute the Spearman correlation between \(x_{t\cdot}\) and \(y_{t\cdot}\) for each \(t\):
ind_idx=1
base_cor_fname = sprintf('results/03_base_cors_%s_%s.RDS',
source_names[ind_idx], signal_names[ind_idx])
df_cor_base = readRDS(base_cor_fname) %>% filter (
Indicator == 'Raw'
)
plt = ggplot(df_cor_base, aes(x = time_value, y = value)) +
geom_line(
#aes(color = Indicator)
) +
labs(title = sprintf("Correlation between %s and %s",
pretty_names[ind_idx],
target_names[ind_idx]),
subtitle = "Per day",
x = "Date", y = "Correlation") +
theme(legend.position = "bottom")
print(plt)
For some reason, \(x\) by itself is “not great” at giving us a sense of how \(y\) is doing. The idea behind sensorization is to replace \(x\) with \(\tilde x\) which does better. We do this by learning specific relationships between \(x\) and \(y\) for each location and time window and using these relationships to “correct” x into \(\tilde x\).
In the basic, spatial-only form of sensorization (as computed in Aaron’s notebook), we ignore the possibility of temporal heterogeneity and learn a single linear relationship between the indicator and target for each location. Specifically, we learn, for each \(\ell \in L\) \[ y_{\cdot\ell} \sim x_{\cdot\ell} \qquad\Rightarrow \mathrm{Model}(\ell) \] and obtain the sensorized indicator values \[ \tilde x_{t\ell} = \texttt{predict}(x_{t\ell}, \mathrm{Model}(\ell)) = \hat y_{t\ell}. \] Note, importantly, that the sensorized \(\tilde x\) is no longer on the same scale as the original indicator \(x\).
Let \(k\) denote the number of days into the past we wish to examine data when fitting our model. (Smaller \(k\) is “more adaptive” to temporal heterogeneity, but may also lead to less stable fits).
We fit, for each \(t, \ell\): \[ y_{(t-k):(t-1), \ell} \sim x_{(t-k):(t-1), \ell} \qquad\Rightarrow \mathrm{Model}(\ell, t, k) \] and obtain the sensorized indicator values \[ \tilde x_{t\ell} = \texttt{predict}(x_{t\ell}, \mathrm{Model}(\ell, t, k)) \] We fit a linear model for each location and day, and then take the prediction for that day. (Huge number of models to fit, but embarrassingly parallelizable).
A wrinkle to our approach is added when we consider the problems of data delay and backfill. For some indicators, data is available “immediately” (usually, this means the morning after the day to be measured) and that version of the data is “final”. This is the ideal case.
However, for many indicators, data is not available until a few days after the date to be measured; and once it becomes available, the data for that date is subsequently updated for several days before it becomes “finalized”. This has bearing on our sensorization analysis, because the historical correlations we obtain by sensorizing using the latest available data can be different from the correlations obtained by sensorizing with contemporaneous data.
We formalize the fact that the data for a single time \(t\) has several versions using a superscript. Let us denote by \(x_{t\ell}^{(t')}\) the indicator value for time \(t\) and location \(\ell\), as reported at time \(t'\). (Similarly, apply this notation for \(y\); and inherit the subscript notation from earlier). When we say \(x_{t\ell}^{(\infty)}\), we mean “the latest available data at time of analysis”. Hence, all methods up until now have assumed \(t' = \infty\).
Recalling \(k\) as the number of days into the past we wish to examine data when fitting our model, and fixing \(\delta\) to be the “delay” at which we sensorize, we fit \[
y_{(t-k):t, \ell}^{(t+\delta)} \sim x_{(t-k):t, \ell}^{(t+\delta)}
\qquad\Rightarrow \mathrm{Model}(\ell, t, k, t+\delta)
\] and obtain the sensorized indicator values \[
\tilde x_{t\ell} = \texttt{predict}(x_{t\ell}, \mathrm{Model}(\ell, t, k, t+\delta))
\] Fitting this model carries the computational annoyance of having to form a new dataset for each \(t\), a task whose burden is greatly lessened by the as_of functionality in our COVIDcast API.
Here we report results for the doctors visits, Facebook %CLI, Facebook %CLI-in-community, and hospitalization indicators. For the first three indicators, the target is cases per 100k; for hospitalization, the target is deaths per 100k.
We compute correlations using the raw (unsensorized) indicator; spatial sensorized, spatial and temporal sensorized. For the latter sensorization, we use several window sizes: up to 7, 10, 14, and 21 days into the past. In the language of “delayed” sensorization, here we use a delay of \(\infty\) (i.e., we always use the most recent data at time of analysis).
In broad terms, we find that sensorizing for both time and space provides the best results; as does using a smaller time window.
sensorize_time_ranges = list(
c(-7, -1),
c(-10, -1),
c(-14, -1),
c(-21, -1))
for (ind_idx in 1:length(source_names)) {
base_cor_fname = sprintf('results/03_base_cors_%s_%s.RDS',
source_names[ind_idx], signal_names[ind_idx])
df_cor_base = readRDS(base_cor_fname)
sensorize_fname = sprintf('results/03_sensorize_cors_%s_%s.RDS',
source_names[ind_idx], signal_names[ind_idx])
sensorize_cors = readRDS(sensorize_fname)
df_cor = bind_rows(df_cor_base, sensorize_cors)
df_cor$Indicator = factor(df_cor$Indicator,
levels=c('Raw',
'Sensorized (Spatial)',
sapply(sensorize_time_ranges,
function(x) {
sprintf('Sensorized (TS, %d:%d)',
x[[1]], x[[2]])
})))
plt = ggplot(df_cor, aes(x = time_value, y = value)) +
geom_line(aes(color = Indicator)) +
labs(title = sprintf("Correlation between %s and %s",
pretty_names[ind_idx],
target_names[ind_idx]),
subtitle = "Per day",
x = "Date", y = "Correlation") +
theme(legend.position = "bottom")
print(plt)
}
The code for fitting these models can be found here.
Impressively, reducing the delay in obtaining data does not induce a strong degradation in the correlation of the sensorized signal with the target (except for the much lower correlations on days following missing data, which is more common at shorter delays).
Here, we plot the results for a few delays (to avoid overcrowding the plot) as well as the sensorization with delay=infinity, from the previous section. We also restrict our attention to t in -7:-1, since that gave us the best results in the previous section. The full set of results may be found in this notebook.
sensorize_time_ranges = list(
c(-7, -1),
c(-10, -1),
c(-14, -1),
c(-21, -1))
delays = c(3, 7, 10, 14)
time_range = 1
for (ind_idx in 1:length(source_names)) {
sensorize_fname = sprintf('results/03_sensorize_cors_%s_%s.RDS',
source_names[ind_idx], signal_names[ind_idx])
sensorize_cors = readRDS(sensorize_fname)
delayed_cors_list = vector('list', length(delays))
for (d_idx in 1:length(delays)) {
sensorize_delayed_fname = sprintf('results/05_sensorize_cors_%s_%s_delay%02d.RDS',
source_names[ind_idx], signal_names[ind_idx],
delays[d_idx])
sensorize_delayed_cors = readRDS(sensorize_delayed_fname)
delayed_cors_list[[d_idx]] = sensorize_delayed_cors[[time_range]]
}
df_cor = bind_rows(sensorize_cors[[time_range]], delayed_cors_list)
df_cor$Indicator = stringr::str_replace(df_cor$Indicator, 'Sensorized ', '')
df_cor$Indicator = factor(df_cor$Indicator,
levels=c(sprintf('(TS, %d:%d)',
sensorize_time_ranges[[time_range]][1],
sensorize_time_ranges[[time_range]][2]),
sapply(delays,
function(x) {
sprintf('(TS, %d:%d; Delay=%02d)',
sensorize_time_ranges[[time_range]][1],
sensorize_time_ranges[[time_range]][2],
x)
})))
plt = ggplot(df_cor, aes(x = time_value, y = value)) +
geom_line(aes(color = Indicator)) +
labs(title = sprintf("Correlation between %s and %s",
pretty_names[ind_idx],
target_names[ind_idx]),
subtitle = "Per day",
x = "Date", y = "Correlation") +
theme(legend.position = "bottom")
print(plt)
}
Ideally, we want to measure “how much” heterogeneity there is in the data, i.e., a statistic for heterogeneity; or even whether there is “enough” heterogeneity that we can say the relationship is heterogeneous (test statistic). More broadly, we would like to be able to say whether there is “more or less” heterogeneity in one indicator vs another.
Here we plot distribution of slopes taken from the sensorization relationships for each of the indicator-target pairs. For brevity, we only display the plots for TS -7:-1. The full set of plots may be found here. Note also that we restrict the vertical limits to the 0.01 and 0.99 quantile of fitted slope values. This is to prevent a handful of outliers (presumably due to unstable fits when there is little data available in a time window) from overtaking the scale of the plot.
We see, concerningly, that the distributions are centered on zero, after we have conditioned on time. A natural question to ask is whether the zero-centering is also true if we instead condition on location; the answer is yes (this was confirmed by calculating the \(\text{Median} \pm \text{MAD}\) interval for each location and seeing whether it covered zero; it did in >99% of locations for the first three indicators (DV, FB) and in 93.6% of locations for Hospitalizations. This analysis may also be found in the previously mentioned notebook.
One possible hypothesis is that (assuming the indicator and target are smooth functions of time, which is roughly true given they have been subject to smoothing) the relationships over small time windows are well-modeled by constant functions. One way to test this would be to sensorize using a model that only fits an intercept, and compare the correlations obtained from this sensorization against the correlations obtained from linear sensorization.
sensorize_time_ranges = list(
c(-7, -1),
c(-10, -1),
c(-14, -1),
c(-21, -1))
QUANTS = c(0.01, 0.99)
for (ind_idx in 1:length(source_names)) {
if (target_names[ind_idx] == 'Cases') {
df_target = df_cases
} else if (target_names[ind_idx] == 'Deaths') {
df_target = df_deaths
} else {
stop(sprintf("No matching dataframe for target %s.", target_names[ind_idx]))
}
base_cor_fname = sprintf('results/03_base_cors_%s_%s.RDS',
source_names[ind_idx], signal_names[ind_idx])
sensorize_fname = sprintf('results/03_sensorize_cors_%s_%s.RDS',
source_names[ind_idx], signal_names[ind_idx])
sensorize_val_fname = sprintf('results/03_sensorize_vals_%s_%s.RDS',
source_names[ind_idx], signal_names[ind_idx])
df_cor_base = readRDS(base_cor_fname)
sensorize_cors = readRDS(sensorize_fname)
sensorized_vals = readRDS(sensorize_val_fname)
inner_idx = 1
sv = sensorized_vals[[inner_idx]]
slope_limits <- quantile(sv$slope, QUANTS, na.rm=TRUE)
plt = ggplot(
sensorized_vals[[inner_idx]],
aes(x=time_value,
y=slope),
) + geom_point (
alpha=0.1,
size=0.5,
) + geom_hline (
yintercept=0,
colour='white',
) + stat_summary (
aes(y=slope,
group=1,
colour='median'),
fun=median,
geom="line",
group=1,
) + stat_summary (
aes(y=slope,
group=1,
colour='+/- mad'),
fun=function(x) { median(x) + mad(x) },
geom="line",
group=1,
) + stat_summary (
aes(y=slope,
group=1,
colour='+/- mad'),
fun=function(x) { median(x) - mad(x) },
geom="line",
group=1,
) + scale_colour_manual(
values=c("median"="maroon",
"+/- mad"="darkgreen")
) + labs(
colour=''
) + ggtitle(
sprintf("Slope distribution for %s, fitted on t in %d:%d",
pretty_names[ind_idx],
sensorize_time_ranges[[inner_idx]][1],
sensorize_time_ranges[[inner_idx]][2])
) + ylim (
slope_limits[[1]], slope_limits[[2]]
)
print(plt)
}